* textreg_train: Stata front end for the Python routine textreg_train().
*
* Usage:  textreg_train yvar textvar [using filename] [, options]
*
*   yvar / textvar       outcome variable and text variable, in that order
*   using                file the fitted model is written to
*                        (default: textreg_model.pkl)
*   model()              "reg" (default) or "logit"
*   regularization()     "lasso", "ridge" (default) or "elasticnet"
*   regu_range()         "start,stop" of the penalty grid (default "0.1,1")
*   seed()               integer seed handed to Python (default 0)
*   tfidf, stem          switches forwarded to Python as "" / option name
*   stem_lang()          stemmer language (default "english")
*   stopwords()          whitespace-separated stop words
*   ngrams(), min_freq(), max_freq(), max_voc()
*                        vectorizer settings; max_voc(1) means "no limit"
capture program drop textreg_train
program textreg_train
    syntax varlist(min=2 max=2)     ///
        [using/]                    ///
        [,                          ///
        Model(string)               ///
        Regularization(string)      ///
        regu_range(string)          ///
        Seed(integer 0)             ///
        TFidf                       ///
        STem                        ///
        stem_lang(string)           ///
        STOPwords(string)           ///
        Ngrams(integer 1)           ///
        min_freq(integer 0)         ///
        max_freq(real 1)            ///
        max_voc(integer 1) ]

    // First variable is the outcome, second one holds the text.
    tokenize `varlist'
    local y "`1'"
    local X "`2'"

    // Fall back to the defaults for missing or unrecognised choices, but tell
    // the user when a value they typed was replaced.
    if !inlist(`"`model'"', "reg", "logit") {
        if `"`model'"' != "" {
            display as text `"note: model(`model') not recognized; using model(reg)"'
        }
        local model "reg"
    }

    if !inlist(`"`regularization'"', "lasso", "ridge", "elasticnet") {
        if `"`regularization'"' != "" {
            display as text `"note: regularization(`regularization') not recognized; using regularization(ridge)"'
        }
        local regularization "ridge"
    }

    if `"`using'"' == "" {
        local using "textreg_model.pkl"
    }

    // seed() is an integer option with default 0, so `seed' always holds a
    // number here and needs no empty-value fallback.

    if `"`stem_lang'"' == "" {
        local stem_lang "english"
    }

    if `"`regu_range'"' == "" {
        local regu_range "0.1,1"
    }

    // max_voc(1) is the sentinel for "no vocabulary limit", which the Python
    // side expresses as None.
    local max_voc_arg "None"
    if `max_voc' != 1 {
        local max_voc_arg "`max_voc'"
    }

    // model_path is emitted as a raw Python string literal so that backslashes
    // in Windows paths are not interpreted as escape sequences.
    python: textreg_train(y="`y'", X="`X'", model="`model'", regularization="`regularization'", regu_range="`regu_range'", ngrams=`ngrams', model_path=r"`using'", seed=`seed', tfidf="`tfidf'", stem="`stem'", stem_lang="`stem_lang'", stopwords="`stopwords'", min_freq=`min_freq', max_freq=`max_freq', max_voc=`max_voc_arg')

end



python

# load necessary python packages
from sfi import Data

import pandas as pd
import numpy as np
import random
from nltk.stem import SnowballStemmer as sns

from sklearn.feature_extraction.text import (CountVectorizer, TfidfVectorizer)
from sklearn.linear_model import (Lasso, Ridge, ElasticNet, LogisticRegression)
from sklearn.metrics import (f1_score, confusion_matrix, r2_score)
from sklearn.model_selection import (train_test_split, StratifiedKFold, KFold)




import pickle

def _textreg_make_estimator(model, regularization, strength, seed):
    """Return an unfitted estimator for the given model family and penalty.

    model          -- "reg" for linear regression, anything else is treated as "logit"
    regularization -- "lasso", "elasticnet"; any other value selects the ridge / l2 variant
    strength       -- passed as ``alpha`` to the linear models and as ``C`` to
                      LogisticRegression
    seed           -- forwarded as ``random_state``
    """
    if model == "reg":
        if regularization == "lasso":
            return Lasso(random_state=seed, alpha=strength)
        if regularization == "elasticnet":
            return ElasticNet(random_state=seed, l1_ratio=0.5, alpha=strength)
        return Ridge(random_state=seed, alpha=strength)

    if regularization == "lasso":
        return LogisticRegression(penalty='l1', solver="saga", random_state=seed, n_jobs=-1, C=strength)
    if regularization == "elasticnet":
        return LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver="saga", random_state=seed, n_jobs=-1, C=strength)
    return LogisticRegression(penalty='l2', solver="saga", random_state=seed, n_jobs=-1, C=strength)


def textreg_train(y, X, model, regularization, regu_range, ngrams, model_path, seed, tfidf, stem, stem_lang, stopwords, min_freq, max_freq, max_voc=None):
    """Vectorize a text variable, tune and fit a penalized model, and pickle the result.

    Parameters
    ----------
    y, X : str
        Names of the outcome and text variables; both are read with ``Data.get``.
    model : str
        "reg" (Lasso / ElasticNet / Ridge, scored with R^2) or "logit"
        (LogisticRegression, scored with micro-averaged F1).
    regularization : str
        "lasso", "elasticnet", or anything else for ridge / l2.
    regu_range : str
        "start,stop"; ten equally spaced candidate strengths are taken from it.
    ngrams : int
        Upper bound of the n-gram range ``(1, ngrams)``.
    model_path : str
        File that receives the pickled vectorizer followed by the pickled model.
    seed : int or None
        Seed for Python's ``random`` module, from which all other seeds are drawn.
    tfidf, stem : str
        Treated as flags: any non-empty string switches the feature on.
    stem_lang : str
        Language handed to the Snowball stemmer.
    stopwords : str
        Whitespace-separated stop words (lower-cased before use).
    min_freq, max_freq : number
        Forwarded as ``min_df`` / ``max_df`` of the vectorizer.
    max_voc : int or None
        Forwarded as ``max_features``.

    Raises
    ------
    ValueError
        If ``model`` is neither "reg" nor "logit", or ``regu_range`` does not
        split into exactly two comma-separated parts.
    """
    if model not in ("reg", "logit"):
        raise ValueError("model must be 'reg' or 'logit', got %r" % (model,))

    print("Step 1/4 :Loading Data from Stata")
    y = pd.Series(Data.get(y))
    X = pd.Series(Data.get(X))

    # Derive every downstream seed from the single user seed. The order of the
    # draws is kept fixed so that results for a given seed stay reproducible.
    random.seed(seed)
    seed1 = random.randint(0, 10000)
    seed2 = random.randint(0, 10000)
    seed3 = [random.randint(0, 10000) for _ in range(10)]
    seed4 = random.randint(0, 10000)

    stop_set = set(stopwords.lower().split())
    # scikit-learn documents stop_words as 'english', a list, or None; a set is
    # rejected by releases that validate constructor parameters.
    stop_list = sorted(stop_set) if stop_set else None

    if stem:
        # set stemmer to specified language
        stemmer = sns(stem_lang)

        def _stem_text(text):
            "Lower-case and split a string, drop stop words, stem the rest."
            return " ".join(
                stemmer.stem(word) for word in text.lower().split() if word not in stop_set
            )

        print("Stemming text")
        X = X.apply(_stem_text)

    print("Step 2/4 :Tokenizing Data")
    # An integer min_df must be at least 1 for scikit-learn's parameter
    # validation; 0 and 1 filter identically, since every vocabulary term
    # occurs in at least one document.
    min_df = 1 if min_freq == 0 else min_freq
    vectorizer_args = dict(
        max_df=float(max_freq),
        min_df=min_df,
        max_features=max_voc,
        ngram_range=(1, ngrams),
        stop_words=stop_list,
    )
    if tfidf:
        print("Tfidf is used")
        cv = TfidfVectorizer(sublinear_tf=True, use_idf=True, **vectorizer_args)
    else:
        cv = CountVectorizer(**vectorizer_args)

    X = cv.fit_transform(X)

    # Hold out 10% of the documents for the final score.
    X_train, X_test, y_train, y_test = train_test_split(X, y.to_numpy(), test_size=0.1, random_state=seed1)

    print("Step 3/4 :Training Model (This may take some time)")
    start, stop = regu_range.split(",")
    regu_range = np.linspace(float(start), float(stop), 10)

    if model == "reg":
        splits = KFold(n_splits=10, shuffle=True, random_state=seed2).split(X_train)
    else:
        splits = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed2).split(X_train, y_train)

    # NOTE(review): each candidate strength is evaluated on exactly one fold
    # (fold k <-> candidate k), not averaged over all folds. Kept as is because
    # changing it would alter both results and runtime.
    score_list = []
    for fold, ((i_train, i_test), regu) in enumerate(zip(splits, regu_range)):
        candidate = _textreg_make_estimator(model, regularization, regu, seed3[fold])
        candidate.fit(X_train[i_train], y_train[i_train])
        fold_pred = candidate.predict(X_train[i_test])
        if model == "reg":
            model_score = r2_score(y_train[i_test], fold_pred)
        else:
            model_score = f1_score(y_train[i_test], fold_pred, average="micro")
        print("Score Fold %i (Regularization=%f): %f" % (fold, regu, model_score))
        score_list.append(model_score)

    regularization_best = regu_range[np.argmax(score_list)]

    # fit final model on the full training split with the winning strength
    reg_model = _textreg_make_estimator(model, regularization, regularization_best, seed4)
    reg_model.fit(X_train, y_train)

    confmat = None
    if model == "logit":
        pred = reg_model.predict(X_test)
        score_best = f1_score(y_test, pred, average="micro")
        confmat = confusion_matrix(y_test, pred)
    else:
        score_best = reg_model.score(X_test, y_test)

    print("*************************************************")
    print("*************** Model Parameters ****************")
    print("*************************************************")
    print("Parameters of tokenizer:")
    print(cv)
    print("*************************************************")
    print("Dimensions of document-n-gram-matrix:")
    print(X.shape)
    print("*************************************************")
    print("Parameters of trained model:")
    print(reg_model)
    print("*************************************************")
    print("Chosen regularization strength:")
    print(regularization_best)
    print("Model Score: %f" % score_best)
    if model == "logit":
        print("Confusion Matrix:")
        print(confmat)
    print("*************************************************")

    print("Step 4/4 :Saving Model")
    # Two consecutive pickles in one file: vectorizer first, model second.
    with open(model_path, "wb") as f:
        pickle.dump(cv, f)
        pickle.dump(reg_model, f)

		

end









